DailyExercise22_CacheLaPoudre_Day2

Author

Clara Jordan

Daily Exercise 21

Download Data

library(dataRetrieval)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.1
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Dig deeper into tidy modeling with R at https://www.tmwr.org
library(dplyr)
library(tsibble)
Registered S3 method overwritten by 'tsibble':
  method               from 
  as_tibble.grouped_df dplyr

Attaching package: 'tsibble'

The following object is masked from 'package:lubridate':

    interval

The following objects are masked from 'package:base':

    intersect, setdiff, union
# Example: Cache la Poudre River at Mouth (USGS site 06752260)
poudre_flow <- readNWISdv(siteNumber = "06752260",    
                          # Download data from USGS for site 06752260
                          parameterCd = "00060",      
                          # Parameter code 00060 = discharge in cfs)
                          startDate = "2013-01-01",   
                          # Set the start date
                          endDate = "2023-12-31") |>
  # Set the end date
  renameNWISColumns() |>                              
  # Rename columns to standard names (e.g., "Flow", "Date")
  mutate(Date = yearmonth(Date)) |>                   
  # Convert daily Date values into a year-month format (e.g., "2023 Jan")
  group_by(Date) |>                                   
  # Group the data by the new monthly Date
  summarise(Flow = mean(Flow))
GET:https://waterservices.usgs.gov/nwis/dv/?site=06752260&format=waterml%2C1.1&ParameterCd=00060&StatCd=00003&startDT=2013-01-01&endDT=2023-12-31
  # Calculate the average daily flow for each month

1. Convert to tsibble

# Convert to tsibble (monthly data, with Date as index)
poudre_ts <- poudre_flow |> 
  as_tsibble(index = Date)

2. Plotting the time series

library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Basic ggplot of flow over time
p <- ggplot(poudre_ts, aes(x = Date, y = Flow)) +
  geom_line(color = "blue") +
  labs(title = "Monthly Average Streamflow - Cache la Poudre River",
       y = "Flow (cfs)", x = "Date") +
  theme_minimal()

# Animate with plotly
ggplotly(p)

3. Subseries

library(feasts)
Loading required package: fabletools

Attaching package: 'fabletools'
The following object is masked from 'package:yardstick':

    accuracy
The following object is masked from 'package:parsnip':

    null_model
The following objects are masked from 'package:infer':

    generate, hypothesize
# Subseries plot to show seasonal patterns
poudre_ts |> 
  gg_subseries(Flow)

4. Decompose

library(fabletools)

# STL decomposition with a seasonal window (say 13 months for annual seasonality)
decomp <- poudre_ts |> 
  model(STL(Flow ~ season(window = "periodic"))) |> 
  components()

# Plot the components
autoplot(decomp)

Daily Exercise 22

1. Use modeltime to forecast the next 12 months of streamflow data in the Poudre River based on last time assignment.

library(modeltime)
library(timetk)
library(tidymodels)
library(lubridate)

# Make sure Date is a date object
poudre_ts <- poudre_ts |>
  mutate(Date = as.Date(Date)) 

# Split data into training (all except last year)
splits <- initial_time_split(poudre_ts, prop = 0.92)  # Approx. up to end of 2022

# Check split
training(splits) %>% tail()
# A tsibble: 6 x 2 [1D]
  Date        Flow
  <date>     <dbl>
1 2022-08-01 59.9 
2 2022-09-01 18.5 
3 2022-10-01 69.1 
4 2022-11-01 11.2 
5 2022-12-01 10.4 
6 2023-01-01  7.29
testing(splits) %>% head()
# A tsibble: 6 x 2 [1D]
  Date         Flow
  <date>      <dbl>
1 2023-02-01   21.9
2 2023-03-01   28.8
3 2023-04-01   76.8
4 2023-05-01  648. 
5 2023-06-01 1499. 
6 2023-07-01  188. 

2. Use the prophet_reg(), and arima_reg() function to create a Prophet model for forecasting.

library(parsnip)
library(lubridate)
library(dplyr)
library(tidymodels)

# Prophet model
prophet_model <- prophet_reg() %>%
  set_engine("prophet")

# ARIMA model
arima_model <- arima_reg() %>%
  set_engine("auto_arima")

# Fit models
model_tbl <- modeltime_table(
  fit(prophet_model, Flow ~ Date, data = training(splits)),
  fit(arima_model, Flow ~ Date, data = training(splits))
)
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
frequency = 12 observations per 1 year

3. Use dataRetrieval to download daily streamflow for the next 12 months. Aggregate this data to monthly averages and compare it to the predictions made by your modeltime model.

poudre_2024 <- readNWISdv(
  siteNumber = "06752260",
  parameterCd = "00060",
  startDate = "2024-01-01",
  endDate = "2024-12-31"
) |> 
  renameNWISColumns() |> 
  mutate(Date = as.Date(Date)) |> 
  mutate(Date = floor_date(Date, "month")) |> 
  group_by(Date) |> 
  summarise(Flow = mean(Flow, na.rm = TRUE), .groups = "drop") 
GET:https://waterservices.usgs.gov/nwis/dv/?site=06752260&format=waterml%2C1.1&ParameterCd=00060&StatCd=00003&startDT=2024-01-01&endDT=2024-12-31
poudre_2024 <- poudre_2024 %>%
  rename(observed_cfs = Flow)

# Refit models on full data
refit_tbl <- model_tbl |>
  modeltime_refit(poudre_ts)
Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
frequency = 12 observations per 1 year
# Create 12 months of future dates
last_date <- max(poudre_ts$Date)
future_dates <- tibble(Date = seq.Date(from = last_date %m+% months(1), by = "month", length.out = 12))

# Forecast future data (12 months)
future_tbl <- testing(splits)

forecast_tbl <- refit_tbl |>
  modeltime_forecast(new_data = future_dates, actual_data = poudre_ts)
Error: Column `Date` (index) must not contain `NA`.
Warning: Unknown or uninitialised column: `.key`.
# Aggregate forecasts to monthly means
forecast_monthly <- forecast_tbl |>
  filter(.key == "prediction") |>
  mutate(month = floor_date(.index, "month")) |>
  group_by(month) |>
  summarise(predicted_cfs = mean(.value, na.rm = TRUE), .groups = "drop") |>
  rename(Date = month)

# RJoin observed and predicted data 
comparison_tbl <- left_join(forecast_monthly, poudre_2024, by = "Date") |>
  filter(!is.na(observed_cfs), !is.na(predicted_cfs))

4. Compute the R2 value between the model predictions and the observed data using a linear model and report the meaning.

# Compute R2 using linear model
if(nrow(comparison_tbl) > 1) {
  r2_model <- lm(observed_cfs ~ predicted_cfs, data = comparison_tbl)
  r2_value <- summary(r2_model)$r.squared
  cat("R2 between predictions and observed values:", round(r2_value, 3), "\n")
} else {
  cat("Not enough overlapping data to compute R2 - check monthly alignment.\n")
}
R2 between predictions and observed values: 0.919 

With this we can see that the R^2 value between predictions and observed values is 0.919. This indicates a strong positive correlation between actual and predicted values and that our model is reliable given the domain of this data.

5. Last, generate a plot of the Predicted vs Observed values and include a 1:1 line, and a linear model line.

library(ggplot2)

# Plot with 1:1 line and regression line
ggplot(comparison_tbl, aes(x = observed_cfs, y = predicted_cfs)) +
  geom_point(size = 2, color = "steelblue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  labs(title = "Predicted vs Observed Streamflow (2024)",
       x = "Observed Flow (cfs)", y = "Predicted Flow (cfs)") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'